3. Main questions
Question 1: Pedicted GI50-values
As we mentioned in our presentation, we want to create a model to predict GI50-values thus to predict, if Lapatinib is a good choice., The first linear model trys to predict the G-50 value under the data of the doubling time.
Fold_ChangeLap = select(Fold_Change, contains("Lapa"))
NegLogGI50Lap = NegLogGI50[9,]
means = colMeans(Fold_ChangeLap)
Fold_Changemeans = as.data.frame(t(means))
a2 = gsub(x = colnames (Fold_Changemeans), pattern = "_lapatinib_10000nM_24h", replacement = "")
colnames(Fold_Changemeans) = a2
a3 = gsub(x = a2, pattern = "X7", replacement = "7")
colnames(Fold_Changemeans) = a3
a1 = gsub(x = colnames (NegLogGI50Lap), pattern = "-", replacement = ".")
colnames(NegLogGI50Lap) = a1
c1 = rbind(a1,NegLogGI50Lap)
c2 = rbind(a3,Fold_Changemeans)
c1 = t(c1)
c2 = t(c2)
c1 =as.data.frame(c1)
c2 =as.data.frame(c2)
c3 = subset(c1, `1` %in% intersect(c1$`1`, c2$V1))
c4 = as.numeric(as.character(c3$lapatinib))
adjustedNeglogI50Lap = as.data.frame(c4)
Fold_Changemeans = as.data.frame(t(Fold_Changemeans))
combined1 = cbind(adjustedNeglogI50Lap, Fold_Changemeans)
names1 = c( "NegLogI50Lap","Fold_Changemeans")
colnames(combined1) = names1
lmFold = lm(NegLogI50Lap ~ Fold_Changemeans, data = combined1)
summary(lmFold)
##
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = combined1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0574 -0.4099 -0.1873 0.2076 2.0682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.464e+00 9.539e-02 57.281 <2e-16 ***
## Fold_Changemeans 1.913e+15 7.519e+14 2.544 0.014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6822 on 52 degrees of freedom
## Multiple R-squared: 0.1107, Adjusted R-squared: 0.09355
## F-statistic: 6.47 on 1 and 52 DF, p-value: 0.01398
qqnorm(lmFold$residuals, main = "Test for normaldistribution of residuals")
qqline(lmFold$residuals)
plot(combined1$NegLogI50Lap, lmFold$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Fold_Changemeans)")
abline(0, 1, col = "red")
cor(combined1$NegLogI50Lap,combined1$Fold_Changemeans)
## [1] 0.3326477
#Split the data (Training - Testing)
n = nrow(combined1)
rmse1 = sqrt(1/n * sum(lmFold$residuals^2))
rmse1
## [1] 0.6694461
i1.train = sample(1:nrow(combined1), 44)
dat1.train = combined1[i1.train, ]
dat1.test = combined1[-i1.train, ]
l1.train = lm(NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
summary(l1.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0139 -0.3949 -0.1546 0.1953 1.8922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.439e+00 1.073e-01 50.685 <2e-16 ***
## Fold_Changemeans 1.584e+15 8.417e+14 1.882 0.0667 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6634 on 42 degrees of freedom
## Multiple R-squared: 0.0778, Adjusted R-squared: 0.05584
## F-statistic: 3.543 on 1 and 42 DF, p-value: 0.06673
n = nrow(dat1.train)
rmse1.train = sqrt(1/n * sum(l1.train$residuals^2))
rmse1.train
## [1] 0.6481528
pred1 = predict(l1.train, newdata = dat1.test)
n = nrow(dat1.test)
residuals = dat1.test$NegLogI50Lap - pred1
rmse1.test1 = sqrt(1/n * sum(residuals^2))
rmse1.test1
## [1] 0.7660867
The second linear model trys to predict the G-50 value under the data of the Foldchange-means.
NegLogGI50Lap = NegLogGI50[9,]
#Sort by Cellline-Name
df = arrange(Cellline_Annotation, Cell_Line_Name)
Doublingtime = cbind.data.frame (df$Cell_Line_Name, df$Doubling_Time)
c21 = as.data.frame(t(NegLogGI50Lap))
combined2 = cbind(c21, Doublingtime$`df$Doubling_Time`)
names2 = c( "NegLogI50Lap","Doubling_Time")
colnames(combined2) = names2
combined2 =na.omit(combined2)
lmDouble = lm(NegLogI50Lap ~ Doubling_Time, data = combined2)
summary(lmDouble)
##
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = combined2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2751 -0.4124 -0.1210 0.1709 2.0784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.147415 0.245361 20.979 <2e-16 ***
## Doubling_Time 0.010536 0.006391 1.649 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6741 on 58 degrees of freedom
## Multiple R-squared: 0.04476, Adjusted R-squared: 0.0283
## F-statistic: 2.718 on 1 and 58 DF, p-value: 0.1046
qqnorm(lmDouble$residuals, main = "Test for normaldistribution of residuals")
qqline(lmDouble$residuals)
plot(combined2$NegLogI50Lap, lmDouble$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Doubling-Time)")
abline(0, 1, col = "red")
cor(combined2$NegLogI50Lap,combined2$Doubling_Time)
## [1] 0.2115772
#Split the data (Training - Testing)
n = nrow(combined2)
rmse2 = sqrt(1/n * sum(lmDouble$residuals^2))
rmse2
## [1] 0.6627233
i2.train = sample(1:nrow(combined2), 48)
dat2.train = combined2[i2.train, ]
dat2.test = combined2[-i2.train, ]
l2.train = lm(NegLogI50Lap ~ Doubling_Time, data = dat2.train)
summary(l2.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = dat2.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22894 -0.34126 -0.07953 0.14257 2.12501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.13037 0.26457 19.392 <2e-16 ***
## Doubling_Time 0.01006 0.00676 1.488 0.144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6736 on 46 degrees of freedom
## Multiple R-squared: 0.04592, Adjusted R-squared: 0.02518
## F-statistic: 2.214 on 1 and 46 DF, p-value: 0.1436
n = nrow(dat2.train)
rmse2.train = sqrt(1/n * sum(l2.train$residuals^2))
rmse2.train
## [1] 0.6594277
pred2 = predict(l2.train, newdata = dat2.test)
n = nrow(dat1.test)
residuals = dat2.test$NegLogI50Lap - pred2
rmse2.test = sqrt(1/n * sum(residuals^2))
rmse2.test
## [1] 0.7451312
As a last part, we did a multiple regression with both datasets to predict GI50-values.
b1 = gsub(x =Doublingtime$`df$Cell_Line_Name`, pattern = "-", replacement = ".")
Doublingtime1 = rbind(b1,Doublingtime$`df$Doubling_Time`)
Doublingtime1 = as.data.frame(t(Doublingtime1))
c31 = subset(Doublingtime1, b1 %in% intersect(Doublingtime1$b1, c2$V1))
c41 = as.numeric(as.character(c31$V2))
adjustedDoubling_Time = as.data.frame(c41)
combined3 = cbind(adjustedNeglogI50Lap, Fold_Changemeans, adjustedDoubling_Time)
names3 = c( "NegLogI50Lap","Fold_Changemeans","Doubling_Time")
colnames(combined3) = names3
mlr = lm(NegLogI50Lap ~ ., data = combined3)
summary(mlr)
##
## Call:
## lm(formula = NegLogI50Lap ~ ., data = combined3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3123 -0.3909 -0.1226 0.2003 1.8141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.064e+00 2.655e-01 19.069 <2e-16 ***
## Fold_Changemeans 1.819e+15 7.429e+14 2.449 0.0178 *
## Doubling_Time 1.083e-02 6.717e-03 1.612 0.1130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6719 on 51 degrees of freedom
## Multiple R-squared: 0.1538, Adjusted R-squared: 0.1206
## F-statistic: 4.635 on 2 and 51 DF, p-value: 0.01415
qqnorm(mlr$residuals, main = "Test for normaldistribution of residuals")
qqline(mlr$residuals)
plot(combined3$NegLogI50Lap, mlr$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values" , main = "Comparison: real and predicted values ~ multiple regression")
abline(0, 1, col = "red")
#Split the data (Training - Testing)
n = nrow(combined3)
rmse3 = sqrt(1/n * sum(mlr$residuals^2))
rmse3
## [1] 0.6530072
i3.train = sample(1:nrow(combined2), 44)
dat3.train = combined3[i3.train, ]
dat3.test = combined3[-i3.train, ]
l3.train = lm(NegLogI50Lap ~ ., data = dat3.train)
summary(l3.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ ., data = dat3.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9238 -0.4782 -0.1381 0.1927 1.6350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.789e+00 3.389e-01 14.132 <2e-16 ***
## Fold_Changemeans 1.150e+15 8.409e+14 1.368 0.1797
## Doubling_Time 2.011e-02 8.917e-03 2.255 0.0301 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6632 on 37 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.1814, Adjusted R-squared: 0.1372
## F-statistic: 4.1 on 2 and 37 DF, p-value: 0.02464
n = nrow(dat3.train)
rmse3.train = sqrt(1/n * sum(l3.train$residuals^2))
rmse3.train
## [1] 0.6081835
pred3 = predict(l3.train, newdata = dat3.test)
n = nrow(dat3.test)
residuals = dat3.test$NegLogI50Lap - pred3
rmse3.test = sqrt(1/n * sum(residuals^2))
rmse3.test
## [1] 0.7577508
As you can see from the data, All three regression models are not really good.
Question 2: Erlotinib vs Lapatinib
# correlation in general
n= as.data.frame(t(NegLogGI50))
rmv.rows = apply(n, 1, function(x) {
sum(is.na(x))
})
NLGI50.all = n[-which(rmv.rows > 0), ] # Removing any row with 1 or more missing values
rm(rmv.rows, n, NegLogGI50)
cor.mat = as.data.frame(cor(NLGI50.all[, 1:ncol(NLGI50.all)], method = "pearson")) #Pearson correlation
round(cor.mat, 2) #round values
## 5-Azacytidine bortezomib cisplatin dasatinib doxorubicin
## 5-Azacytidine 1.00 -0.08 0.16 0.18 0.29
## bortezomib -0.08 1.00 0.01 -0.10 0.32
## cisplatin 0.16 0.01 1.00 -0.24 0.52
## dasatinib 0.18 -0.10 -0.24 1.00 -0.08
## doxorubicin 0.29 0.32 0.52 -0.08 1.00
## erlotinib 0.27 -0.32 0.01 0.42 -0.17
## geldanamycin 0.23 0.36 0.19 -0.09 0.23
## gemcitibine 0.16 -0.08 0.53 -0.03 0.37
## lapatinib 0.14 -0.26 -0.07 0.19 -0.16
## paclitaxel 0.10 0.20 0.01 -0.10 0.55
## sirolimus -0.05 0.01 0.27 0.07 0.17
## sorafenib 0.09 0.27 -0.01 -0.24 0.14
## sunitinib 0.12 -0.01 -0.05 -0.03 -0.14
## topotecan 0.14 0.13 0.55 0.02 0.60
## vorinostat 0.16 -0.02 0.07 -0.16 -0.06
## erlotinib geldanamycin gemcitibine lapatinib paclitaxel
## 5-Azacytidine 0.27 0.23 0.16 0.14 0.10
## bortezomib -0.32 0.36 -0.08 -0.26 0.20
## cisplatin 0.01 0.19 0.53 -0.07 0.01
## dasatinib 0.42 -0.09 -0.03 0.19 -0.10
## doxorubicin -0.17 0.23 0.37 -0.16 0.55
## erlotinib 1.00 -0.01 0.01 0.65 -0.37
## geldanamycin -0.01 1.00 0.12 -0.01 0.28
## gemcitibine 0.01 0.12 1.00 -0.15 0.03
## lapatinib 0.65 -0.01 -0.15 1.00 -0.24
## paclitaxel -0.37 0.28 0.03 -0.24 1.00
## sirolimus 0.21 -0.21 0.05 0.21 -0.04
## sorafenib -0.29 0.14 -0.01 -0.25 0.29
## sunitinib 0.06 0.24 0.06 0.12 -0.02
## topotecan -0.02 0.21 0.63 -0.14 0.20
## vorinostat 0.12 0.20 0.18 0.26 0.09
## sirolimus sorafenib sunitinib topotecan vorinostat
## 5-Azacytidine -0.05 0.09 0.12 0.14 0.16
## bortezomib 0.01 0.27 -0.01 0.13 -0.02
## cisplatin 0.27 -0.01 -0.05 0.55 0.07
## dasatinib 0.07 -0.24 -0.03 0.02 -0.16
## doxorubicin 0.17 0.14 -0.14 0.60 -0.06
## erlotinib 0.21 -0.29 0.06 -0.02 0.12
## geldanamycin -0.21 0.14 0.24 0.21 0.20
## gemcitibine 0.05 -0.01 0.06 0.63 0.18
## lapatinib 0.21 -0.25 0.12 -0.14 0.26
## paclitaxel -0.04 0.29 -0.02 0.20 0.09
## sirolimus 1.00 -0.11 -0.20 0.03 0.02
## sorafenib -0.11 1.00 0.05 0.16 0.10
## sunitinib -0.20 0.05 1.00 -0.05 -0.13
## topotecan 0.03 0.16 -0.05 1.00 0.08
## vorinostat 0.02 0.10 -0.13 0.08 1.00
produce pairwise scatter plots
pairs(NLGI50.all[, 1:ncol(NLGI50.all)], pch = 20, cex = 0.8, col = "royalblue3", main = "Correlation_NegLogGI50")
matrix of correlations
cor = cor(NLGI50.all)
pheatmap(cor, col = cm.colors(256), main = "Pheatmap correlation")
Erlotinib and lapatinib have a strong correlation.
plot erlotinib all genes, coloured by tissue
#differece
diff = data.frame(erlotinib = NLGI50.all$erlotinib - mean(NLGI50.all$erlotinib), lapatinib = NLGI50.all$lapatinib- mean(NLGI50.all$lapatinib))
diff$celllines = rownames(NLGI50.all)
#create vector to insert column tissue from Metadata
tissue = sapply(1:nrow(diff), function(x) {
position = which(as.character(Metadata$cell) == diff[x, "celllines"])[1] #if tissue occurs several times, take the first
out = as.character(Metadata[position, "tissue"]) #output the tissue at this position
return(out)
})
diff$tissue = tissue
rm(tissue)
diff$celllines = factor(diff$celllines, levels = diff$celllines[order(diff$tissue)]) #Classified by tissue
ggplot(diff, aes(x = celllines, y = erlotinib, fill = tissue))+geom_bar(stat = "identity") + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib")
The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Erlotinib.
plot lapatinib all genes, coloured by tissue
ggplot(diff, aes(x = celllines, y = lapatinib, fill = tissue)) + geom_bar(stat="identity") + coord_flip() + labs(title="Mean graph plot of NLGI50 values for Lapatinib")
The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Lapatinib.
correlation erlotinib , lapatinib
cor(NLGI50.all$erlotinib, NLGI50.all$lapatinib, method = "pearson")
## [1] 0.6528188
A Pearson correlation coefficient of ~ 0.65 confirms that these patterns are similar.
Lung genes
#only lung with mean all
### load data
Metadata_Lapatinib_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM"),]
NegLogGI50 = as.data.frame(readRDS(paste0(wd, "/Data/NegLogGI50.rds")))
#lung genes from Metadata
Lung_Metadata_L_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM" & Metadata$tissue == "Lung"),]
celllines = Lung_Metadata_L_treated$cell
NegLogGI50.lung = as.data.frame(t(NegLogGI50[c("erlotinib", "lapatinib"), celllines]))
#Difference
dif.NegLogGI50.lung = data.frame(erlotinib = NegLogGI50.lung$erlotinib - mean(NLGI50.all$erlotinib), lapatinib = NegLogGI50.lung$lapatinib - mean(NLGI50.all$lapatinib)) #erlotinib data - mean value, lapatinib data - mean value
dif.NegLogGI50.lung$celllines = rownames(NegLogGI50.lung)
# PLot
ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = erlotinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label = round(erlotinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib, only Lung genes")
plot lapatinib
ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = lapatinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label=round(lapatinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Lapatinib, only Lung genes")
correlation lung
cor(NegLogGI50.lung$erlotinib, NegLogGI50.lung$lapatinib, method = "pearson")
## [1] 0.9609488
A pearson correlation coefficent of ~ 0.96 suggests that Lapatinib has a similar effect on lung cancer as Erlotinib
Anova
selection of Lapatinib and Erlotinib treated cells
lapa<-data.frame(Metadata[which(Metadata[,'drug'] == "lapatinib"), ])
erlo<-data.frame(Metadata[which(Metadata[,'drug'] == "erlotinib"), ])
el<-right_join(lapa,erlo, by="cell")
rmv.rows = apply(el, 1, function(x) {
sum(is.na(x))
}) # Go through each row and sum up all missing values
row.names(rmv.rows)
Create data frame with lapatinib and erlotinib data
fc<-data.frame(Treated-Untreated)
dim(fc)
## [1] 13299 819
all<-data.frame(fc[grep("lapatinib|erlotinib", colnames(fc))])
dim(all)
## [1] 13299 113
since erlotinip contains more columns than lapatinib, we have to remove these columns
all.rmv<-data.frame(all %>% select(
-"CCRF.CEM_erlotinib_10000nM_24h",
-"HL.60_erlotinib_10000nM_24h",
-"HT29_erlotinib_10000nM_24h",
-"K.562_erlotinib_10000nM_24h",
-"LOX_erlotinib_10000nM_24h",
-"SR_erlotinib_10000nM_24h",
-"COLO.205_lapatinib_10000nM_24h"))
dim(all.rmv)
## [1] 13299 106
Checking the rows
la<-data.frame(all.rmv[grep("lapatinib", colnames(all.rmv))])
ncol(la)
## [1] 53
er<-data.frame(all.rmv[grep("erlotinib", colnames(all.rmv))])
ncol(er)
## [1] 53
erla<-data.frame(er,la)
ncol(all.rmv) #to prove if the columns are removed
## [1] 106
Anova
drug<-c(rep('Erlotinib',53), rep('Lapatinib',53))
expression_drug<-apply(erla, MARGIN = 2, sum)
df_drug<-data.frame(expression_drug, drug)
library(ggpubr)
ggboxplot (data = df_drug, x="drug", y="expression_drug", color = "drug",
add = "jitter", legend = "none")+
rotate_x_text(angle = 45)+
stat_compare_means(method = "anova")+ # Add global annova p-value
stat_compare_means(label = "p.signif", method = "t.test",
ref.group = ".all.", hide.ns = TRUE) # Pairwise comparison against all
Question 3: Comparing lapatinib treated breast and cns celllines
L_fc <- select(Fold_Change, contains("Lapa"))
L_fc <- as.data.frame(t(L_fc))
rownames(Metadata) <- Metadata$sample
L_treated <- select(Treated, contains("Lapa"))
L_treated <- t(L_treated)
L_untreated <- select(Untreated, contains("Lapa"))
L_untreated <- t(L_untreated)
# selecting breast Lapatinib samples
breast <- Metadata[Metadata[,'tissue']=="Breast",]
rownames(breast) <- breast$sample
rownames(breast) <- gsub(x = rownames(breast), pattern = "-", replacement = ".")
breastFC <- subset(L_fc, rownames(L_fc) %in% rownames(breast))
breastTreated <- subset(L_treated, rownames(L_treated) %in% rownames(breast))
breastUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(breast))#
# selecting CNS Lapatinib samples
cns <- Metadata[Metadata[,'tissue']=="CNS",]
rownames(cns) <- cns$sample
rownames(cns) <- gsub(x = rownames(cns), pattern = "-", replacement = ".")
cnsFC <- subset(L_fc, rownames(L_fc) %in% rownames(cns))
cnsTreated <- subset(L_treated, rownames(L_treated) %in% rownames(cns))
cnsUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(cns))
#performing a paired t-test of treated and untreated samples
t_test_cns <- col_t_paired(cnsTreated, cnsUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)
t_test_breast <- col_t_paired(breastTreated, breastUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)
#obtaining Benjamini-Hochberg adjusted p-values
pval_cns <- t_test_cns$pvalue
pval_breast <- t_test_breast$pvalue
fdr_cns <- p.adjust(pval_cns, "BH")
fdr_breast <- p.adjust(pval_breast, "BH")
#obtaining mean FC values over all samples
breastFCm <- as.numeric(colMeans(breastFC))
cnsFCm <- as.numeric(colMeans(cnsFC))
genes <- colnames(breastFC)
## breast volcano plot
#creating a matrix containg all needed values for plotting
diff_df_breast <- data.frame(gene = genes, Fold = breastFCm, FDR = fdr_breast)
diff_df_breast$absFold <- abs(diff_df_breast$Fold)
head(diff_df_breast)
## gene Fold FDR absFold
## 1 A1CF 0.037268413 0.8765540 0.037268413
## 2 A2M -0.032213825 0.7188608 0.032213825
## 3 A4GALT 0.006012452 0.9793436 0.006012452
## 4 A4GNT -0.053969518 0.4235638 0.053969518
## 5 AAAS 0.081656784 0.5283372 0.081656784
## 6 AACS 0.023767096 0.8115022 0.023767096
# add a grouping column; default value is "not significant"
diff_df_breast$group <- "NotSignificant"
# change the grouping for the entries with significance but not a large enough Fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) < 0.2 ),"group"] <- "Significant"
# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_breast[which(diff_df_breast['FDR'] > 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "FoldChange"
# change the grouping for the entries with both significance and large enough fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"
# Find and label the top peaks.
top_peaks_breast <- diff_df_breast[with(diff_df_breast, order(Fold, FDR)),][1:10,]
top_peaks_breast <- rbind(top_peaks_breast, diff_df_breast[with(diff_df_breast, order(-Fold, FDR)),][1:10,])
# Add gene labels for all of the top genes we found
# creating an empty list, and filling it with entries for each row in the dataframe
# each list entry is another list with named items that will be used
a <- list()
for (i in seq_len(nrow(top_peaks_breast))) {
m <- top_peaks_breast[i, ]
a[[i]] <- list(
x = m[["Fold"]],
y = -log10(m[["FDR"]]),
text = m[["gene"]],
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = 20,
ay = -40
)
}
plot_breast <- plot_ly(data = diff_df_breast, x = diff_df_breast$Fold, y = -log10(diff_df_breast$FDR), type = "scatter", text = diff_df_breast$gene, mode = "markers", color = diff_df_breast$group) %>%
layout(title ="Volcano Plot of Lapatinib breast cancer samples",
xaxis = list(title="log2 Fold Change"),
yaxis = list(title="FDR")) %>%
layout(annotations = a)
plot_breast
###thresholds still need to be discussed
## CNS volcano plot
diff_df_cns <- data.frame(gene = genes, Fold = cnsFCm, FDR = fdr_cns)
diff_df_cns$absFold <- abs(diff_df_cns$Fold)
head(diff_df_cns)
## gene Fold FDR absFold
## 1 A1CF 0.066575311 0.5939566 0.066575311
## 2 A2M 0.038348381 0.6873009 0.038348381
## 3 A4GALT 0.000390011 0.9980719 0.000390011
## 4 A4GNT -0.018219799 0.8780106 0.018219799
## 5 AAAS 0.014723327 0.9008420 0.014723327
## 6 AACS 0.003887384 0.9870209 0.003887384
# add a grouping column; default value is "not significant"
diff_df_cns$group <- "NotSignificant"
# change the grouping for the entries with significance but not a large enough Fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) < 0.2 ),"group"] <- "Significant"
# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_cns[which(diff_df_cns['FDR'] > 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "FoldChange"
# change the grouping for the entries with both significance and large enough fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"
# Find and label the top peaks..
top_peaks_cns <- diff_df_cns[with(diff_df_cns, order(Fold, FDR)),][1:10,]
top_peaks_cns <- rbind(top_peaks_cns, diff_df_cns[with(diff_df_cns, order(-Fold, FDR)),][1:10,])
a <- list()
for (i in seq_len(nrow(top_peaks_cns))) {
m <- top_peaks_cns[i, ]
a[[i]] <- list(
x = m[["Fold"]],
y = -log10(m[["FDR"]]),
text = m[["gene"]],
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = 20,
ay = -40
)
}
plot_cns <- plot_ly(data = diff_df_cns, x = diff_df_cns$Fold, y = -log10(diff_df_cns$FDR),type = "scatter", text = diff_df_cns$gene, mode = "markers", color = diff_df_cns$group) %>%
layout(title ="Volcano Plot of Lapatinib CNS cancer samples",
xaxis = list(title="log2 Fold Change"),
yaxis = list(title="FDR"))%>%
layout(annotations = a)
plot_cns
# selecet top peak genes common in cns and breast tissue
tpb_comparison <- subset(top_peaks_breast, gene %in% top_peaks_cns$gene)
tpc_comparison <- subset(top_peaks_cns, gene %in% top_peaks_breast$gene)
# order common genes alphabetically
tpb_comparison <- tpb_comparison[order(tpb_comparison$gene),]
tpc_comparison <- tpc_comparison[order(tpc_comparison$gene),]
## creating heat map of FCs to compare values
cor_mat <- cbind("breast" = tpb_comparison$Fold, "cns" = tpc_comparison$Fold)
rownames(cor_mat) <- tpb_comparison$gene
data <- read.delim
pheatmap(
mat = cor_mat,
color = magma(10),
border_color = "black",
show_colnames = TRUE,
show_rownames = TRUE,
drop_levels = TRUE,
fontsize = 14,
main = "Comparison:
FC levels of cns and breast top peak genes"
)
#correlation between top peak gene expression patterns in breast and cns tissues treated by lapatinib
cor_mat <- as.data.frame(cor_mat)
dif.FC.BC = data.frame(breast = cor_mat$breast - mean(t(breastFC)), cns = cor_mat$cns - mean(t(cnsFC))) #breast data - mean value, cns data - mean value
dif.FC.BC$gene = rownames(cor_mat)
# PLot
ggplot(dif.FC.BC,aes(x = gene, y = breast)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = round(breast, 2)), vjust = -0.5, color = "black", size = 3) +
coord_flip() + labs(title = "Mean graph plot of Fold Change for breast top peak genes")
ggplot(dif.FC.BC,aes(x = gene, y = cns)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = round(cns, 2)), vjust = -0.5, color = "black", size = 3) +
coord_flip() + labs(title = "Mean graph plot of Fold Change for CNS top peak genes")
#correlation
cor(cor_mat$breast, cor_mat$cns, method = "pearson")
## [1] 0.921812